library(data.table) #reading in the data
library(dplyr) #dataframe manipulation
library(ggplot2) #viz
library(ranger) #the random forest implementation
library(plotly) #3D plotting
library(tidyr) #dataframe manipulation
library(FNN) #k nearest neighbors algorithm
library(xgboost)
fb <- fread("~/PycharmProjects/kaggle-project/facebook/train.csv", integer64 = "character", showProgress = FALSE)
nrow(fb)
## [1] 29118021
head(fb, 3)
## row_id x y accuracy time place_id
## 1: 0 0.7941 9.0809 54 470702 8523065625
## 2: 1 5.9567 4.7968 13 186555 1757726713
## 3: 2 8.3078 7.0407 74 322648 1137537235
summary(fb)
## row_id x y accuracy
## Min. : 0 Min. : 0.000 Min. : 0.000 Min. : 1.00
## 1st Qu.: 7279505 1st Qu.: 2.535 1st Qu.: 2.497 1st Qu.: 27.00
## Median :14559010 Median : 5.009 Median : 4.988 Median : 62.00
## Mean :14559010 Mean : 5.000 Mean : 5.002 Mean : 82.85
## 3rd Qu.:21838515 3rd Qu.: 7.461 3rd Qu.: 7.510 3rd Qu.: 75.00
## Max. :29118020 Max. :10.000 Max. :10.000 Max. :1033.00
## time place_id
## Min. : 1 Length:29118021
## 1st Qu.:203057 Class :character
## Median :433922 Mode :character
## Mean :417010
## 3rd Qu.:620491
## Max. :786239
fb %>% filter(x >1, x <1.25, y >2.5, y < 2.75) -> fb_s
nrow(fb_s)
## [1] 17710
fb %>% filter(x >3, x <3.25, y >2.5, y < 2.75) -> fb_s2
nrow(fb_s2)
## [1] 15929
Since target is to classify place_id, 1st to observe place_id From figures: horizon vs continuously increase
par(mfrow=c(3,1))
plot(sort(fb_s$place_id))
plot(sort(fb_s2$place_id))
plot(sort(fb_s2$place_id)[0:2000])
From figures: seem no relation between place_id and accuracy
par(mfrow=c(2,1))
r_pla_aur=sort(fb_s2$place_id,index.return=TRUE)
plot(r_pla_aur$x[0:2000])
d=r_pla_aur$ix[0:2000]
plot(fb_s2[d,"accuracy"])
par(mfrow=c(2,1))
r_pla_aur=sort(fb_s2$place_id,index.return=TRUE)
plot(r_pla_aur$x[500:750])
d=r_pla_aur$ix[500:750]
plot(fb_s2[d,"accuracy"])
fb_s$hour = (fb_s$time/60) %% 24
fb_s$weekday = (fb_s$time/(60*24)) %% 7
fb_s$month = (fb_s$time/(60*24*30)) %% 12 #month-ish
fb_s$year = fb_s$time/(60*24*365)
fb_s$day = fb_s$time/(60*24) %% 365
head(fb_s)
## row_id x y accuracy time place_id hour weekday
## 1 600 1.2214 2.7023 17 65380 6683426742 9.666667 3.4027778
## 2 957 1.1832 2.6891 58 785470 6683426742 11.166667 6.4652778
## 3 4345 1.1935 2.6550 11 400082 6889790653 20.033333 4.8347222
## 4 4735 1.1452 2.6074 49 514983 6822359752 15.050000 0.6270833
## 5 5580 1.0089 2.7287 19 732410 1527921905 14.833333 4.6180556
## 6 6090 1.1140 2.6262 11 145507 4000153867 1.116667 3.0465278
## month year day
## 1 1.513426 0.1243912 189.5072
## 2 6.182176 1.4944254 2276.7246
## 3 9.261157 0.7611910 1159.6580
## 4 11.920903 0.9798002 1492.7043
## 5 4.953935 1.3934741 2122.9275
## 6 3.368218 0.2768398 421.7594
Split data
small_train = fb_s[fb_s$time < 7.3e5,]
small_val = fb_s[fb_s$time >= 7.3e5,]
visualize 2D: small_train
ggplot(small_train, aes(x, y )) +
geom_point(aes(color = place_id)) +
theme_minimal() +
theme(legend.position = "none") +
ggtitle("Check-ins colored by place_id")
Count by place_id
sort((small_train %>% count(place_id))$n, decreasing = T)[0:140]
## [1] 968 848 784 757 651 603 560 512 495 463 378 312 247 243 214 204 199
## [18] 198 179 175 171 167 165 156 155 153 149 134 127 125 124 115 114 113
## [35] 111 106 104 104 96 95 92 89 89 87 87 84 83 82 82 79 79
## [52] 76 72 72 71 70 67 64 61 60 54 54 54 53 53 49 48 47
## [69] 46 46 45 43 42 41 40 38 36 34 34 34 32 32 32 31 30
## [86] 30 30 30 28 27 26 26 26 25 25 24 24 24 23 23 23 22
## [103] 21 21 21 20 20 20 19 19 18 17 15 15 15 15 15 13 13
## [120] 13 13 13 12 12 12 11 11 11 10 10 10 10 10 10 9 9
## [137] 9 9 9 9
visualize 3D: small_train with place_id count>500 z=hour
small_train %>% count(place_id) %>% filter(n > 500) -> ids
#if n>200, warning: n too large, allowed maximum for palette Set2 is 8
small_trainz = small_train[small_train$place_id %in% ids$place_id,]
plot_ly(data = small_trainz, x = x , y = y, z = hour, color = place_id, type = "scatter3d", mode = "markers", marker=list(size= 5)) %>% layout(title = "Place_id's by position and Time of Day")
z=week
plot_ly(data = small_trainz, x = x , y = y, z = weekday, color = place_id, type = "scatter3d", mode = "markers", marker=list(size= 5)) %>% layout(title = "Place_id's by position and Day of Week")
Count unique place_id
length(unique(small_train$place_id))
## [1] 770
Ignore fewer place_id
small_train %>% count(place_id) %>% filter(n > 3) -> ids
small_train = small_train[small_train$place_id %in% ids$place_id,]
summary(small_train)
## row_id x y accuracy
## Min. : 600 Min. :1.000 Min. :2.500 Min. : 1.00
## 1st Qu.: 7327896 1st Qu.:1.049 1st Qu.:2.575 1st Qu.: 24.00
## Median :14412209 Median :1.123 Median :2.644 Median : 62.00
## Mean :14500685 Mean :1.123 Mean :2.633 Mean : 80.27
## 3rd Qu.:21625800 3rd Qu.:1.191 3rd Qu.:2.688 3rd Qu.: 75.00
## Max. :29112154 Max. :1.250 Max. :2.750 Max. :1000.00
## time place_id hour weekday
## Min. : 203 Length:15595 Min. : 0.000 Min. :0.000694
## 1st Qu.:163173 Class :character 1st Qu.: 6.683 1st Qu.:1.767014
## Median :371294 Mode :character Median :11.933 Median :3.287500
## Mean :366123 Mean :12.017 Mean :3.422029
## 3rd Qu.:565108 3rd Qu.:17.533 3rd Qu.:5.152431
## Max. :729969 Max. :23.983 Max. :6.999306
## month year day
## Min. : 0.000532 Min. :0.0003862 Min. : 0.5884
## 1st Qu.: 1.846852 1st Qu.:0.3104509 1st Qu.: 472.9652
## Median : 3.725116 Median :0.7064193 Median :1076.2145
## Mean : 4.589201 Mean :0.6965804 Mean :1061.2251
## 3rd Qu.: 7.300185 3rd Qu.:1.0751684 3rd Qu.:1637.9957
## Max. :11.999907 Max. :1.3888299 Max. :2115.8522
s = 2
l = 125
w = 500
create_matrix = function(train) {
cbind(s*train$y,
train$x,
train$hour/l,
train$weekday/w,
train$year/w,
train$month/w,
train$time/(w*60*24*7))
}
X = create_matrix(small_train)
X_val = create_matrix(small_val)
KNN
model_knn = FNN::knn(train = X, test = X_val, cl = small_train$place_id, k = 15)
preds <- as.character(model_knn)
truth <- as.character(small_val$place_id)
mean(truth == preds)
## [1] 0.5151964
head(X)
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 5.4046 1.2214 0.077333333 0.006805556 0.0002487823 0.003026852
## [2,] 5.3100 1.1935 0.160266667 0.009669444 0.0015223820 0.018522315
## [3,] 5.2148 1.1452 0.120400000 0.001254167 0.0019596005 0.023841806
## [4,] 5.2524 1.1140 0.008933333 0.006093056 0.0005536796 0.006736435
## [5,] 5.0006 1.1449 0.135600000 0.005412500 0.0012038699 0.014647083
## [6,] 5.0374 1.2015 0.128266667 0.007336111 0.0026283181 0.007977870
## [,7]
## [1,] 0.01297222
## [2,] 0.07938135
## [3,] 0.10217917
## [4,] 0.02887044
## [5,] 0.06277321
## [6,] 0.13704802
Random Forest
set.seed(131L)
small_train$place_id <- as.factor(small_train$place_id) # ranger needs factors for classification
model_rf <- ranger(place_id ~ x + y + accuracy + hour + weekday + month + year,
small_train,
num.trees = 100,
write.forest = TRUE,
importance = "impurity")
## Growing trees.. Progress: 64%. Estimated remaining time: 17 seconds.
pred = predict(model_rf, small_val)
pred = pred$predictions
accuracy = mean(pred == small_val$place_id)
accuracy
## [1] 0.5507784
Visualize RF accuracy It does seem that the correctly identified check-ins are more “clustered” while the wrongly identified ones are more uniformly distributed but other than that no clear patters here.
small_val$Correct = (pred == small_val$place_id)
ggplot(small_val, aes(x, y )) +
geom_point(aes(color = Correct)) +
theme_minimal() +
scale_color_brewer(palette = "Set1")
look at what kind of id’s our random forest gets wrong We see below that our model is doing actually really great on the more popular id’s(more blue on the right). However it loses when it looks at id’s that appear only a few times.
#reordering the levels based on counts:
small_val$place_id <- factor(small_val$place_id,
levels = names(sort(table(small_val$place_id), decreasing = TRUE)))
small_val %>%
ggplot(aes(x = place_id)) + geom_bar(aes(fill = Correct)) +
theme_minimal() +
theme(axis.text.x = element_blank()) +
ggtitle("Prediction Accuracy by ID and Popularity") +
scale_fill_brewer(palette = "Set1")
importance of our variables 1. y variable is more important than the x This means that the y axis is a better predictior of place_id and the random forest figures this out on its own. 2. hour and other time features are also good predictiors but less so than the spatial features - this makes sense since the location of a check-in should be more important than the time of the check-in. 3. Accuracy is a bit misterious since we don’t get an actual definition for it, but at least the model tells us it’s somewhat important.
data.frame(as.list(model_rf$variable.importance)) %>% gather() %>%
ggplot(aes(x = reorder(key, value), y = value)) +
geom_bar(stat = "identity", width = 0.6, fill = "grey") +
coord_flip() +
theme_minimal() +
ggtitle("Variable Importance (Gini Index)") +
theme(axis.title.y = element_blank())